import os
import numpy as np
import pandas as pd
from glob import glob
import shutil
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib import cm
# Global matplotlib look-and-feel.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old aliases were later removed, so pick whichever name this installation knows.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
Training neural networks for automated diagnosis of pigmented skin lesions is hampered by the small size and lack of diversity of available datasets of dermatoscopic images. We tackle this problem by releasing the HAM10000 ("Human Against Machine with 10000 training images") dataset. We collected dermatoscopic images from different populations, acquired and stored by different modalities. The final dataset consists of 10015 dermatoscopic images which can serve as a training set for academic machine learning purposes. Cases include a representative collection of all important diagnostic categories in the realm of pigmented lesions: Actinic keratoses and intraepithelial carcinoma / Bowen's disease (akiec), basal cell carcinoma (bcc), benign keratosis-like lesions (solar lentigines / seborrheic keratoses and lichen-planus like keratoses, bkl), dermatofibroma (df), melanoma (mel), melanocytic nevi (nv) and vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage, vasc).
More than 50% of lesions are confirmed through histopathology (histo), the ground truth for the rest of the cases is either follow-up examination (follow_up), expert consensus (consensus), or confirmation by in-vivo confocal microscopy (confocal). The dataset includes lesions with multiple images, which can be tracked by the lesion_id-column within the HAM10000_metadata file.
Due to upload size limitations, images are stored in two files:
* HAM10000_images_part1.zip (5000 JPEG files)
* HAM10000_images_part2.zip (5015 JPEG files)
The HAM10000 dataset served as the training set for the ISIC 2018 challenge (Task 3). The test-set images are available herein as ISIC2018_Task3_Test_Images.zip (1511 images), the official validation-set is available through the challenge website https://challenge2018.isic-archive.com/. The ISIC-Archive also provides a "Live challenge" submission site for continuous evaluation of automated classifiers on the official validation- and test-set.
Test-set evaluations of the ISIC 2018 challenge were compared to physicians on an international scale, where the majority of challenge participants outperformed expert readers: Tschandl P. et al., Lancet Oncol 2019
The test-set images were also used in a study comparing different methods and scenarios of human-computer collaboration: Tschandl P. et al., Nature Medicine 2020. The following corresponding metadata is available herein:
# Root folder of the dataset; images are looked up one level below it
# (Base/<subfolder>/<image_id>.jpg) and the metadata CSV sits directly inside it.
Base = 'slatmd'
# image_id (file name without extension) -> path of the image on disk
ImageID_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(Base, '*', '*.jpg'))}
# Diagnosis code -> human-readable diagnosis name
Lesion_dict = {'akiec': 'Actinic Keratoses and Intraepithelial Carcinoma',
               'bcc': 'Basal Cell Carcinoma',
               'bkl': 'Benign Keratosis-like Lesions',
               'df': 'Dermatofibroma',
               'mel': 'Melanoma',
               'nv': 'Melanocytic Nevi',
               'vasc': 'Vascular Lesions'}
Data = pd.read_csv(os.path.join(Base, 'HAM10000_metadata.csv'))
# Rows whose image_id / dx has no entry in the dictionaries get NaN here
Data['path'] = Data['image_id'].map(ImageID_path_dict)
Data['dx_full'] = Data['dx'].map(Lesion_dict)
# Prettify the categorical labels ('follow_up' -> 'Follow Up').
# Vectorised .str methods replace the deprecated DataFrame.applymap and leave
# missing values untouched instead of raising AttributeError on them.
Data[['dx_type', 'sex', 'localization']] = Data[['dx_type', 'sex', 'localization']].apply(
    lambda col: col.str.replace('_', ' ', regex=False).str.title())
Data.head(10)
| | lesion_id | image_id | dx | dx_type | age | sex | localization | path | dx_full |
|---|---|---|---|---|---|---|---|---|---|
| 0 | HAM_0000118 | ISIC_0027419 | bkl | Histo | 80.0 | Male | Scalp | slatmd\HAM10000_images_part_1\ISIC_0027419.jpg | Benign Keratosis-like Lesions |
| 1 | HAM_0000118 | ISIC_0025030 | bkl | Histo | 80.0 | Male | Scalp | slatmd\HAM10000_images_part_1\ISIC_0025030.jpg | Benign Keratosis-like Lesions |
| 2 | HAM_0002730 | ISIC_0026769 | bkl | Histo | 80.0 | Male | Scalp | slatmd\HAM10000_images_part_1\ISIC_0026769.jpg | Benign Keratosis-like Lesions |
| 3 | HAM_0002730 | ISIC_0025661 | bkl | Histo | 80.0 | Male | Scalp | slatmd\HAM10000_images_part_1\ISIC_0025661.jpg | Benign Keratosis-like Lesions |
| 4 | HAM_0001466 | ISIC_0031633 | bkl | Histo | 75.0 | Male | Ear | slatmd\HAM10000_images_part_2\ISIC_0031633.jpg | Benign Keratosis-like Lesions |
| 5 | HAM_0001466 | ISIC_0027850 | bkl | Histo | 75.0 | Male | Ear | slatmd\HAM10000_images_part_1\ISIC_0027850.jpg | Benign Keratosis-like Lesions |
| 6 | HAM_0002761 | ISIC_0029176 | bkl | Histo | 60.0 | Male | Face | slatmd\HAM10000_images_part_1\ISIC_0029176.jpg | Benign Keratosis-like Lesions |
| 7 | HAM_0002761 | ISIC_0029068 | bkl | Histo | 60.0 | Male | Face | slatmd\HAM10000_images_part_1\ISIC_0029068.jpg | Benign Keratosis-like Lesions |
| 8 | HAM_0005132 | ISIC_0025837 | bkl | Histo | 70.0 | Female | Back | slatmd\HAM10000_images_part_1\ISIC_0025837.jpg | Benign Keratosis-like Lesions |
| 9 | HAM_0005132 | ISIC_0025209 | bkl | Histo | 70.0 | Female | Back | slatmd\HAM10000_images_part_1\ISIC_0025209.jpg | Benign Keratosis-like Lesions |
def DatasetDist(Table, Target, PD):
    """Show a frequency table (left) next to a donut chart (right) for one feature.

    Parameters
    ----------
    Table : DataFrame-like
        Must provide the column named by ``Target`` plus 'Count' and
        'Percentage' columns; it is not modified (a copy is formatted).
    Target : str
        Name of the category column in ``Table``; also used as the figure title.
    PD : dict
        Plot settings. Keys read here: 'column_widths', 'pull', 'textfont',
        'PieColors', 'hole', 'height', 'legend_orientation', 'legend_title',
        'TableColors' (header fill, cell fill), 'tablecolumnwidth',
        'title_x', 'title_y'.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.02, column_widths=PD['column_widths'],
                        specs=[[{"type": "table"}, {"type": "pie"}]])
    # Right: pie chart
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values,
                         pull=PD['pull'], textfont=dict(size=PD['textfont']),
                         marker=dict(colors=PD['PieColors'], line=dict(color='black', width=1))),
                  row=1, col=2)
    # Must run while the pie is the only trace: update_traces targets every
    # trace in the figure and 'hole' is a pie-only attribute.
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height=PD['height'], legend=dict(orientation=PD['legend_orientation']),
                      legend_title_text=PD['legend_title'])
    # Left: table
    T = Table.copy()
    # Rendered with the percent sign in front of the number, e.g. '%12.34'
    T['Percentage'] = T['Percentage'].map(lambda x: '%%%.2f' % x)
    cell_values = [T.loc[:, col].values for col in T.columns]
    fig.add_trace(go.Table(header=dict(values=list(Table.columns), line_color='darkslategray',
                                       fill_color=PD['TableColors'][0], align=['center', 'center'],
                                       font=dict(color='white', size=12), height=25),
                           columnwidth=PD['tablecolumnwidth'],
                           cells=dict(values=cell_values, line_color='darkslategray',
                                      fill=dict(color=[PD['TableColors'][1], PD['TableColors'][1]]),
                                      align=['center', 'center', 'center'], font_size=12, height=20)),
                  1, 1)
    # The bold tag is now closed properly (the original emitted '<b>' twice)
    fig.update_layout(title={'text': '<b>' + Target + '</b>', 'x': PD['title_x'],
                             'y': PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def DistPlot(Feat, Target, PD, Inp = Data):
    """Plot a histogram of ``Feat`` coloured by ``Target`` with median and mean markers.

    Parameters
    ----------
    Feat : str
        Numeric column of ``Inp`` placed on the x axis.
    Target : str
        Column of ``Inp`` used to colour the bars.
    PD : dict
        Plot settings. Keys read here: 'nbins', 'marginal', 'Bar_Colors',
        'ylims' (two-element [low, high]), 'Title', 'title_x', 'title_y',
        'height', 'legend_orientation', 'legend_x', 'legend_y',
        'legend_title', 'line_color'.
    Inp : DataFrame-like, optional
        Data to plot. Defaults to the ``Data`` frame that existed when this
        function was defined.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    # hover_data is taken from Inp (not the global Data) so the function also
    # works with a custom frame and after the global has been deleted.
    fig = px.histogram(Inp, x=Feat, nbins=PD['nbins'], color=Target, marginal=PD['marginal'],
                       color_discrete_sequence=PD['Bar_Colors'], hover_data=Inp.columns)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    # Vertical marker lines: a constant x repeated for every integer y in the
    # visible range. x is sized from y so both arrays always match, even when
    # the lower y-limit is not zero.
    y_line = np.arange(int(PD['ylims'][0]), int(PD['ylims'][1]))
    # Median
    fig.add_trace(go.Scatter(x=Inp[Feat].median() * np.ones(y_line.shape[0]), y=y_line,
                             name="Median", line=dict(color='RoyalBlue', width=2, dash='dot')))
    # Mean
    fig.add_trace(go.Scatter(x=Inp[Feat].mean() * np.ones(y_line.shape[0]), y=y_line,
                             name="Mean", line=dict(color='Red', width=2, dash='dot')))
    # The bold tag is now closed properly (the original emitted '<b>' twice)
    fig.update_layout(title={'text': '<b>' + PD['Title'] + '</b>', 'x': PD['title_x'],
                             'y': PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency',
                      plot_bgcolor='white', height=PD['height'],
                      legend=dict(orientation=PD['legend_orientation'], x=PD['legend_x'], y=PD['legend_y']),
                      legend_title_text=PD['legend_title'])
    fig.update_traces(marker_line_color=PD['line_color'], marker_line_width=0.5, opacity=1)
    fig['layout']['yaxis'].update(range=PD['ylims'])
    fig.show()
# ---- Distribution of the diagnosis (full lesion name) ----
Feat = 'dx_full'
Name = 'Lesion'
# rename_axis + reset_index(name=...) yields the columns [Name, 'Count'] on every
# pandas version; renaming a column called 'index' silently does nothing on
# pandas >= 2.0, where value_counts() names the index after the source column.
Table = Data[Feat].value_counts().rename_axis(Name).reset_index(name='Count')
Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()), 2)
# Every pie slice is pulled out slightly; the last (smallest) one a bit more
PD = dict(PieColors = px.colors.sequential.RdPu_r, TableColors = ['DarkBlue','GhostWhite'], hole = .4,
          column_widths=[0.6, 0.4], textfont = 14, height = 450, tablecolumnwidth = [.1, .02, .025],
          pull = [0.01] * (len(Table[Name]) - 1) + [.05],
          legend_title = Name, legend_orientation = 'h', title_x = 0.5, title_y = 0.85)
DatasetDist(Table, Target = Name, PD = PD)
# Age histogram coloured by diagnosis
PD = dict(nbins = 20, Bar_Colors = px.colors.sequential.RdPu_r ,
          marginal='box', line_color = 'Black', ylims = [0, 1400],
          Title = 'Lesion and Age', title_x = 0.5, title_y = .95, height = 700,
          legend_title = 'Lesion', legend_orientation = 'h', legend_x = 0.01, legend_y = -0.2)
DistPlot('age', Target = Feat, PD = PD)
# ---- Distribution of the ground-truth confirmation type ----
Feat = 'dx_type'
Name = 'Lesion Type'
# rename_axis + reset_index(name=...) yields the columns [Name, 'Count'] on every
# pandas version; renaming a column called 'index' silently does nothing on
# pandas >= 2.0, where value_counts() names the index after the source column.
Table = Data[Feat].value_counts().rename_axis(Name).reset_index(name='Count')
Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()), 2)
# Every pie slice is pulled out slightly; the last (smallest) one a bit more
PD = dict(PieColors = px.colors.sequential.Magma_r, TableColors = ['DarkRed','MistyRose'], hole = .4,
          column_widths=[0.5, 0.5],textfont = 14, height = 400, tablecolumnwidth = [.1, .05, .08],
          pull = [0.01] * (len(Table[Name]) - 1) + [.05],
          legend_title = Name, legend_orientation = 'v', title_x = 0.5, title_y = 0.85)
DatasetDist(Table, Target = Name, PD = PD)
# Age histogram coloured by confirmation type. Title and legend title now name
# the feature actually plotted (they were copy-pasted from the lesion plot).
PD = dict(nbins = 20, Bar_Colors = px.colors.sequential.Magma_r,
          marginal='box', line_color = 'Black', ylims = [0, 1400],
          Title = Name + ' and Age', title_x = 0.5, title_y = .95, height = 500,
          legend_title = Name, legend_orientation = 'h', legend_x = 0.01, legend_y = -0.2)
DistPlot('age', Target = Feat, PD = PD)
# ---- Distribution of the body site of the lesion ----
Feat = 'localization'
Name = 'Lesion Localization'
# rename_axis + reset_index(name=...) yields the columns [Name, 'Count'] on every
# pandas version; renaming a column called 'index' silently does nothing on
# pandas >= 2.0, where value_counts() names the index after the source column.
Table = Data[Feat].value_counts().rename_axis(Name).reset_index(name='Count')
Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()), 2)
# Every pie slice is pulled out slightly; the last (smallest) one a bit more
PD = dict(PieColors = px.colors.sequential.Rainbow, TableColors = ['DarkSlateGray','HoneyDew'], hole = .4,
          column_widths=[0.4, 0.4],textfont = 14, height = 550, tablecolumnwidth = [.1, .05, .08],
          pull = [0.01] * (len(Table[Name]) - 1) + [.05],
          legend_title = Name, legend_orientation = 'h', title_x = 0.5, title_y = 0.9)
DatasetDist(Table, Target = Name, PD = PD)
# Age histogram coloured by body site. Title and legend title now name the
# feature actually plotted (they were copy-pasted from the lesion plot).
PD = dict(nbins = 20, Bar_Colors = px.colors.sequential.Aggrnyl_r,
          marginal=None, line_color = 'Black', ylims = [0, 1400],
          Title = Name + ' and Age', title_x = 0.5, title_y = .95, height = 500,
          legend_title = Name, legend_orientation = 'h', legend_x = 0.01, legend_y = -0.2)
DistPlot('age', Target = Feat, PD = PD)
# ---- Distribution of patient sex ----
Feat = 'sex'
Name = 'Gender'
# rename_axis + reset_index(name=...) yields the columns [Name, 'Count'] on every
# pandas version; renaming a column called 'index' silently does nothing on
# pandas >= 2.0, where value_counts() names the index after the source column.
Table = Data[Feat].value_counts().rename_axis(Name).reset_index(name='Count')
Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()), 2)
Table[Name] = Table[Name].map(lambda x: x.replace('_',' ').title())
# Every pie slice is pulled out slightly; the last (smallest) one a bit more
PD = dict(PieColors = ['RoyalBlue','DeepPink','LawnGreen'], TableColors = ['SlateGray','AliceBlue'], hole = .4,
          column_widths=[0.5, 0.5],textfont = 14, height = 400, tablecolumnwidth = [.1, .05, .08],
          pull = [0.01] * (len(Table[Name]) - 1) + [.05],
          legend_title = Name, legend_orientation = 'v', title_x = 0.5, title_y = 0.85)
DatasetDist(Table, Target = Name, PD = PD)
# Age histogram coloured by sex. Title and legend title now name the feature
# actually plotted (they were copy-pasted from the lesion plot).
PD = dict(nbins = 20, Bar_Colors = ['RoyalBlue','DeepPink','LawnGreen'],
          marginal='violin', line_color = 'Black', ylims = [0, 1400],
          Title = Name + ' and Age', title_x = 0.5, title_y = .95, height = 600,
          legend_title = Name, legend_orientation = 'h', legend_x = 0.01, legend_y = -0.2)
DistPlot('age', Target = Feat, PD = PD)
Note that the directory contains two subfolders, and each holds a distinct set of pictures. We verify this below by checking that no file name appears in both subfolders.
# Explicit copy: adding columns to a bare slice of Data triggers pandas'
# SettingWithCopyWarning and is not guaranteed to write to the intended object.
FilesInfo = Data[['image_id','path','dx']].copy()
# Split each path with os.path instead of a hard-coded '\\' so the notebook also
# runs where the separator is '/'. 'folder' is the directory directly containing
# the image, 'file' the bare file name.
FilesInfo['folder'] = FilesInfo['path'].map(lambda x: os.path.basename(os.path.dirname(x)))
FilesInfo['file'] = FilesInfo['path'].map(os.path.basename)
SubFolders = FilesInfo['folder'].unique()
print('Subfolders: %s' % ', '.join(SubFolders.tolist()))
# Drop the full metadata frame; the code below works from FilesInfo
del Data
Subfolders: HAM10000_images_part_1, HAM10000_images_part_2
def List_Intersect(List1, List2):
    """Return the items present in both inputs as a list.

    Duplicates are collapsed and the order of the result is unspecified,
    because the computation goes through sets.
    """
    first = set(List1)
    second = set(List2)
    shared = first.intersection(second)
    return list(shared)
# File names found in both sub-folders; an empty list means the folders are disjoint
List_Intersect(
    FilesInfo['file'][FilesInfo['folder'] == SubFolders[0]].tolist(),
    FilesInfo['file'][FilesInfo['folder'] == SubFolders[1]].tolist(),
)
[]
Copying all files into a temporary directory.
# Creating a new directory (a previous copy is removed first so stale files cannot linger)
NewDire = Base+'_mod'
if os.path.exists(NewDire):
    shutil.rmtree(NewDire)
# Creating sub-directories: one per diagnosis code
for subfolder in FilesInfo['dx'].unique().tolist():
    os.makedirs(os.path.join(NewDire, subfolder), exist_ok=True)
Counter = 0
Progress_Bar = progressbar.ProgressBar(maxval=FilesInfo.shape[0],
                                       widgets=[progressbar.Bar('#', '|', '|'), progressbar.Percentage()])
Progress_Bar.start()
for _, row in FilesInfo.iterrows():
    # from the current dir to a new one: <NewDire>/<dx>/<file>
    shutil.copy(row['path'], os.path.join(NewDire, row['dx'], row['file']))
    # Count the file first, then report, so the bar shows the number of files
    # already copied instead of lagging one behind.
    Counter += 1
    Progress_Bar.update(Counter)
Progress_Bar.finish()
|#########################################################################|100%
Now, let us inspect the structure of the new directory:
def Path_Tree(PATH, Extension):
    """Print a coloured one-level overview of PATH and return its matching files.

    For every sub-directory directly inside PATH this prints the directory
    name, the number of files whose name ends with ``Extension`` and the
    first five of those file names.

    Parameters
    ----------
    PATH : str
        Directory to inspect.
    Extension : str
        File-name suffix to keep, e.g. '.jpg'.

    Returns
    -------
    dict
        Sub-directory name -> list of file names ending with ``Extension``.
        A sub-directory without matching files maps to an empty list.
    """
    Out = {}
    sep = ' ' * 3
    # Background colours used in turn for the sub-directory headings
    palette = [Back.RED, Back.GREEN, Back.YELLOW, Back.BLUE, Back.MAGENTA, Back.CYAN]
    # Last path component, independent of the separator and of a trailing one
    title = os.path.basename(os.path.normpath(PATH))
    rule = '=' * (len(title) + 1)
    print(Style.RESET_ALL + Fore.BLUE + Style.NORMAL + rule + Style.RESET_ALL)
    print(Back.BLACK + Fore.CYAN + Style.NORMAL + title+':'+ Style.RESET_ALL)
    print(Style.RESET_ALL + Fore.BLUE + Style.NORMAL + rule + Style.RESET_ALL)
    # List the directory once and keep only its sub-directories
    subdirs = [entry for entry in os.listdir(PATH) if os.path.isdir(os.path.join(PATH, entry))]
    for i, entry in enumerate(subdirs):
        # Modulo cycling replaces the pre-multiplied colour list
        print('└──', palette[i % len(palette)] + Fore.BLACK + Style.NORMAL + entry+':'+ Style.RESET_ALL)
        List = [x for x in os.listdir(os.path.join(PATH, entry)) if x.endswith(Extension)]
        Out[entry] = List
        # Label comes from the first file when there is one; otherwise from the
        # requested extension, so an empty folder no longer raises IndexError.
        label = (List[0].split('.')[-1] if List else Extension.lstrip('.')).upper()
        print(2* sep, Fore.BLUE + Style.NORMAL +
              '%i %s files:' % (len(List), label) + Style.RESET_ALL)
        if List:
            print(2* sep, ', '.join(List[:5]) + ', ...')
    return Out
# Print the overview of the newly created directory; the returned mapping is discarded
_ = Path_Tree(NewDire, '.jpg')
=========== slatmd_mod: =========== └── akiec: 327 JPG files: ISIC_0024329.jpg, ISIC_0024372.jpg, ISIC_0024418.jpg, ISIC_0024450.jpg, ISIC_0024463.jpg, ... └── bcc: 514 JPG files: ISIC_0024331.jpg, ISIC_0024332.jpg, ISIC_0024345.jpg, ISIC_0024360.jpg, ISIC_0024403.jpg, ... └── bkl: 1099 JPG files: ISIC_0024312.jpg, ISIC_0024324.jpg, ISIC_0024336.jpg, ISIC_0024337.jpg, ISIC_0024338.jpg, ... └── df: 115 JPG files: ISIC_0024318.jpg, ISIC_0024330.jpg, ISIC_0024386.jpg, ISIC_0024396.jpg, ISIC_0024553.jpg, ... └── mel: 1113 JPG files: ISIC_0024310.jpg, ISIC_0024313.jpg, ISIC_0024315.jpg, ISIC_0024323.jpg, ISIC_0024333.jpg, ... └── nv: 6705 JPG files: ISIC_0024306.jpg, ISIC_0024307.jpg, ISIC_0024308.jpg, ISIC_0024309.jpg, ISIC_0024311.jpg, ... └── vasc: 142 JPG files: ISIC_0024370.jpg, ISIC_0024375.jpg, ISIC_0024402.jpg, ISIC_0024475.jpg, ISIC_0024662.jpg, ...